This module extracts and describes the Census data and Points of Interest to be used for Model Evaluation.
The raw data files were downloaded in geojson format from the American Community Survey 2019 - 5 year survey. The ultimate data source is the US Census Bureau. The data distributor is censusreporter.org
library(tidyverse)
library(kableExtra)
library(sf)
library(stars)
library(ggplot2)
library(ggspatial)
library(viridis)
library(leaflet)
library(classInt)
library(RColorBrewer)Let’s load the Raleigh Corporate Limits. By superimposing the city limits with census data, we can verify that the city is fully covered by the latter.
raleigh_corporate_limits_file = paste0(data_dir, "Corporate_Limits.geojson")
# Load the geojson files
# change the coordinate system to 2264
# filter objects that belong to RALEIGHT
# --------------------------------------------
gj_raleigh <- sf::st_read(raleigh_corporate_limits_file, quiet = TRUE) %>%
st_transform(2264) %>%
filter( SHORT_NAME == 'RAL' )
un_raleigh_buffer = st_buffer(gj_raleigh, dist = 100 ) %>% st_union() %>% st_sf()Load the poverty geojson census data.
pov_root = "acs2019_5yr_B17017_15000US371830529022"
poverty_B17017_raw = st_read( paste0(data_dir, pov_root, "/", pov_root, ".geojson" )
, quiet = TRUE ) %>% st_transform(2264)Extract the regional average poverty rate.
regional_avg_poverty_rate = poverty_B17017_raw %>%
filter( geoid == "31000US39580") %>%
mutate( rate = B17017002 / B17017001) %>%
pull(rate)Impute any missing poverty rate regions with the median rate for the entire Raleigh-Cary Metro Area.
poverty_B17017_raw %>%
select( geoid, name, B17017001, B17017002 ) %>%
filter( name != 'Raleigh-Cary, NC Metro Area') %>%
mutate( poverty_rate = ifelse( B17017001 == 0, regional_avg_poverty_rate , B17017002 / B17017001) ) %>%
mutate( area = st_area(.)) -> gj_povertypal_fun = colorQuantile("Spectral", NULL , n = 6)
gj_poverty_leaf = st_transform(gj_poverty, 4326)
p_popup <- paste0("Poverty: ", format(round(gj_poverty$poverty_rate, 3), nsmall = 3 ), "<br>" ,
"Name: ", gj_poverty$name , "<br>" ,
"Geoid: ", gj_poverty$geoid, "<br>" )
# get quantile breaks. Add .00001 offset to catch the lowest value
breaks_qt <- classIntervals(c(min(gj_poverty$poverty_rate) - .00001,
gj_poverty$poverty_rate), n = 6, style = "quantile")
breaks_qt## style: quantile
## [-1e-05,0.02073365) [0.02073365,0.04442344) [0.04442344,0.07557252)
## 104 104 104
## [0.07557252,0.1113514) [0.1113514,0.1808219) [0.1808219,0.6422414]
## 104 104 105
leaflet( gj_poverty_leaf ) %>% addPolygons( stroke = FALSE, # Remove polygon borders
fillColor = ~pal_fun(poverty_rate), # set fill color with function from above and value
fillOpacity = 0.5 ,
smoothFactor = 0.5, # make it nicer
popup = p_popup
) %>% addTiles() %>%
addLegend( "bottomright", # Location
colors = brewer.pal(6, "Spectral") ,
labels = paste0("up to ", format(breaks_qt$brks[-1], digits = 2)) ,
title = "Raleigh Poverty rate by Census Block Group"
)Table B23025 contains Employment Status for the Population 16 years and over. The relevant field is Unemployed % from the civilian labor force.
unemp_root = "acs2019_5yr_B23025_15000US371830529022"
unemp_B23025_raw = st_read( paste0(data_dir, unemp_root, "/", unemp_root, ".geojson" )
, quiet = TRUE ) %>% st_transform(2264)Extract the regional average unemployment rate for imputation of regions with undefined unemployment rates.
regional_avg_unemp_rate = unemp_B23025_raw %>%
filter( geoid == "31000US39580") %>%
mutate( rate = B23025005 / B23025001) %>%
pull( rate )Impute any missing poverty rate regions with the median rate for the entire Raleigh-Cary Metro Area.
unemp_B23025_raw %>%
select( geoid, name, B23025005, B23025001 ) %>%
filter( name != 'Raleigh-Cary, NC Metro Area') %>%
mutate( unemployment_rate = ifelse( B23025001 == 0, regional_avg_unemp_rate , B23025005 / B23025001) ) %>%
mutate( area = st_area(.)) -> gj_unempgj_unemp_leaf = st_transform(gj_unemp, 4326)
p_popup <- paste0("Unemp: ", format(round(100 * gj_unemp$unemployment_rate, 1), nsmall = 1 ), "<br>" ,
"Name: ", gj_unemp$name , "<br>" ,
"Geoid: ", gj_unemp$geoid, "<br>" )
# get quantile breaks. Add .00001 offset to catch the lowest value
breaks_qt <- classIntervals(c(min(gj_unemp$unemployment_rate) - .00001,
gj_unemp$unemployment_rate), n = 6, style = "quantile")
breaks_qt## style: quantile
## [-1e-05,0.002762431) [0.002762431,0.01464605) [0.01464605,0.02296588)
## 104 103 105
## [0.02296588,0.03476483) [0.03476483,0.05166586) [0.05166586,0.1609695]
## 104 104 105
leaflet( gj_unemp_leaf ) %>% addPolygons( stroke = FALSE, # Remove polygon borders
fillColor = ~pal_fun(unemployment_rate), # set fill color with function from above and value
fillOpacity = 0.5 ,
smoothFactor = 0.5, # make it nicer
popup = p_popup
) %>% addTiles() %>%
addLegend( "bottomright", # Location
colors = brewer.pal(6, "Spectral") ,
labels = paste0("up to ", format(breaks_qt$brks[-1], digits = 2)) ,
title = "Raleigh Unemployment By Block Group"
)ggplot() + geom_sf(data=gj_unemp , aes( fill = unemployment_rate ) ) +
scale_fill_viridis_c( alpha = .4) +
annotation_map_tile(type="osm", progress = "none", alpha = 0.5 ) Table B19013 contains Median Household Income . The relevant field is Unemployed % from the civilian labor force.
#income_root = "acs2019_5yr_B19013_15000US371830523011" # Smaller Income region
income_root = "acs2019_5yr_B19013_15000US371830529022" # Larger Income region
income_B19013_raw = st_read( paste0(data_dir, income_root, "/", income_root, ".geojson" )
, quiet = TRUE ) %>% st_transform(2264)Extract the regional average unemployment rate for imputation of regions with undefined unemployment rates.
regional_avg_income = 67266 # Median household income for Raleigh city in 2019 inflated adjusted dollarsImpute any missing median household income with the regional median household income for the entire Raleigh-Cary Metro Area.
income_B19013_raw %>% select( geoid, name, B19013001 ) %>%
# filter( str_length(geoid) == 19 ) %>% # Filter out the higher level aggregate regions shown by shorter geoid like Cary, NC, Raleigh, NC, etc.
mutate( income = ifelse( is.na(B19013001) | B19013001 == 0, regional_avg_income , B19013001) ) %>%
mutate( area = st_area(.)) -> gj_incomegj_income_leaf = st_transform(gj_income, 4326)
p_popup <- paste0("Income: $", format(round( gj_income$income, 0), nsmall = 0 ), "<br>" ,
"Name: ", gj_income$name , "<br>" ,
"Geoid: ", gj_income$geoid, "<br>" )
# get quantile breaks. Add .00001 offset to catch the lowest value
breaks_qt <- classIntervals(c(min(gj_income$income) - .00001,
gj_income$income), n = 6, style = "quantile")
breaks_qt## style: quantile
## [14414,44519) [44519,56506) [56506,67266) [67266,84951) [84951,111067)
## 106 106 90 122 106
## [111067,232955]
## 107
leaflet( gj_income_leaf ) %>% addPolygons( stroke = FALSE, # Remove polygon borders
fillColor = ~pal_fun(income), # set fill color with function from above and value
fillOpacity = 0.5 ,
smoothFactor = 0.5, # make it nicer
popup = p_popup
) %>% addTiles() %>%
addLegend( "bottomright", # Location
colors = brewer.pal(6, "Spectral") ,
labels = paste0("up to ", format(breaks_qt$brks[-1], digits = 2)) ,
title = "Raleigh Median Household Income"
)First, we do a visual inspection using leaflet to ensure Raleigh is inside the census region. Then, we’ll run formal checks using st_contains for containment of the city in each census region and st_crosses to confirm census regions have no overlapping areas.
The plot below examines Raleigh and median household income region.
leaflet( gj_income_leaf ) %>%
addPolygons( data = gj_raleigh %>% st_transform( 4326 ) , weight = 2, color = "red", opacity = 0.8) %>%
addPolygons( stroke = FALSE, # Remove polygon borders
fillColor = ~pal_fun(income), # set fill color with function from above and value
fillOpacity = 0.5 ,
smoothFactor = 0.5, # make it nicer
popup = p_popup
) %>%
addTiles() %>%
addLegend( "bottomright", # Location
colors = brewer.pal(6, "Spectral") ,
labels = paste0("up to ", format(breaks_qt$brks[-1], digits = 2)) ,
title = "Raleigh Median Household Income"
)